Skunkware 5

home *** CD-ROM | disk | FTP | other *** search

/ Skunkware 5 / Skunkware 5.iso / src / Tools / glimpse-2.1 / index / build_in.c next >

Wrap

C/C++ Source or Header | 1995-06-21 | 47.5 KB | 1,469 lines

/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */ /* ./glimpse/index/build_in.c */ /* -------------------------------------------------------------- build_index(): build an index list from a set of files. INPUT: a set of file names char *name_list[MAX_LIST]; a partition table in p_table[MAX_PARTITION]; OUTPUT: an index list; char *index_list; the index list is a char string as follows: each entry of the index list contains two parts: name and indices, where name is an ascii character string, and indices is a list of short integer. (unsigned char) We use newline as a 'record delimiter' (a 'record is logically a word associated with its indices), and WORD_END_MARK to separate a word from its list of indices (s.t. fscanf %s works). Since we restrict the max number of partitions to be 255. a byte is enough to represent the index value. Note that there cannot be a partition #ed '\n'. An example index list: (in logical view) this 12 19 \n is 9 17 12 18 19 \n an 7 12 \n example 16 \n -----------------------------------------------------------------------*/ #include <stdlib.h> #include "glimpse.h" #define debugt #define BINARY 1 /* #define SW_DEBUG the original sw output of index set */ /* This flag must always be defined: it is used only in build_in.c */ /* #define UDI_DEBUG the original outputs of each indexed file */ /* Some variables used throughout */ #if BG_DEBUG FILE *LOGFILE; /* file descriptor for LOG output */ #endif /*BG_DEBUG*/ FILE *STATFILE; /* file descriptor for statistical data about indexed files */ FILE *MESSAGEFILE; /* file descriptor for important messages meant for the user */ char INDEX_DIR[MAX_LINE_LEN]; struct stat istbuf; struct stat excstbuf; struct stat incstbuf; int ICurrentFileOffset; int NextICurrentFileOffset; /* Some options used throughout */ int OneFilePerBlock = OFF; extern int IndexNumber; extern int CountWords; extern int StructuredIndex; extern int InterpretSpecial; int total_size = 0; int MAXWORDSPERFILE 
= 0; int NUMERICWORDPERCENT = 50; int AddToIndex = OFF; int FastIndex = OFF; int BuildDictionary = OFF; int BuildDictionaryExisting = OFF; int CompressAfterBuild = OFF; int IncludeHigherPriority = OFF; int FilenamesOnStdin = OFF; int UseFilters = OFF; int ByteLevelIndex = OFF; /* int IndexUnderscore = OFF; */ int IndexableFile = OFF; int MAX_INDEX_PERCENT = DEF_MAX_INDEX_PERCENT; int MAX_PER_MB = DEF_MAX_PER_MB; int AddedMaxWordsMessage = OFF; int AddedMixedWordsMessage = OFF; int icount=0; /* count the number of my_malloc for indices structure */ int hash_icount=0; /* to see how much was added to the current hash table */ int save_icount=0; /* to see how much was added to the index by the current file */ int numeric_icount=0; /* to see how many numeric words were there in the current file */ int num_filter=0; int filter_len[MAX_FILTER]; CHAR *filter[MAX_FILTER]; CHAR *filter_command[MAX_FILTER]; int mask_int[32] = MASK_INT; char *name_list[MAX_LIST]; int disable_list[FILEMASK_SIZE]; int p_table[MAX_PARTITION]; int *size_list = NULL; /* temporary area to store size of each file */ int p_size_list[MAX_PARTITION]; /* sum of the sizes of the files in each partition */ int part_num = 1; /* number of partitions */ int memory_usage = 0; extern char *getword(); int file_num = 0; extern int attr_num; char * my_malloc(len) int len; { char *s; /* char *malloc(); declared in stdlib.h */ if ((s = (char *)malloc(len)) != NULL) memory_usage += len; else fprintf(stderr, "malloc failed after memory_usage = %x Bytes\n", memory_usage); /* Don't exit since might do traverse here: exit in glimpse though */ #if BG_DEBUG printf("m:%x ", memory_usage); #endif /*BG_DEBUG*/ return s; } my_free(ptr, size) void *ptr; int size; { free(ptr); memory_usage -= size; #if BG_DEBUG printf("f:%x ", memory_usage); #endif /*BG_DEBUG*/ } int bp=0; /* buffer pointer */ unsigned char word[MAX_WORD_BUF]; int FirstTraverse1 = ON; struct indices *ip; struct token *hash_table[MAX_64K_HASH+1]; build_index() { 
/* Body of build_index(): FirstTraverse1 is cleared when adding to an existing
 * index (AddToIndex / FastIndex); traverse1() tests that flag to choose
 * between writing a fresh index file and merging into the existing one. */
	if (AddToIndex || FastIndex) {
		FirstTraverse1 = OFF;
	}
	build_hash();
	traverse1();
	return;
}

/* ----------------------------------------------------------------------
traverse()
function: traverse the hash list of indices = a hash list is a array of
linked list, where every node in a linked list contains a word whose
hash_value is the same.
While traversing the hash list, traverse() output a stream of index list.
It also my_frees the memory used in hash_table.

Output goes to INDEX_DIR/I2, one '\n'-terminated record per token:
  [4-byte big-endian encoded attribute, only if StructuredIndex]
  the word itself
  then either ALL_INDEX_MARK + DONT_CONFUSE_SORT (word exceeded the
  stop-list threshold), or WORD_END_MARK followed by the encoded indices.
Exits with status 3 if the output file cannot be opened and status 2 if
writing a record terminator fails.
NOTE(review): the path is built with sprintf() into a 256-byte buffer while
INDEX_DIR is MAX_LINE_LEN bytes -- confirm it cannot overflow.
NOTE(review): the bucket loop stops at MAX_64K_HASH-1 although hash_table
has MAX_64K_HASH+1 slots -- confirm no bucket is skipped.
------------------------------------------------------------------------*/
traverse()
{
	int numseencount = 0;	/* tokens written and freed (reported under BG_DEBUG) */
	int numelements;	/* indices structures seen for the current token */
	int numonline;		/* offsets written on the current output line (byte-level index) */
	int i, j, k;
	struct token *tp, *tp_old;
	struct indices *ip, *ip_old;
	FILE *f_out;
	char s[256];
	char *word;
	int x = -1, y=0, diff, temp, even_words=1; /* 0 is an even number */

#ifdef SW_DEBUG
	printf("in traverse()\n");
#endif
	sprintf(s, "%s/%s", INDEX_DIR, I2);
	if ((f_out = fopen(s, "w")) == NULL) {
		fprintf(stderr, "Cannot open %s for writing\n", s);
		exit(3);
	}

	for(i=0; i<MAX_64K_HASH; i++) {
		if(hash_table[i] == NULL) continue;
		tp = hash_table[i];
		tp_old = tp;
		while(tp != NULL) { /* traverse the token list */
			if (StructuredIndex) { /* force big-endian as usual */
				k = encode32b(tp->attribute);
				putc((k&0xff000000)>>24, f_out);
				putc((k&0x00ff0000)>>16, f_out);
				putc((k&0x0000ff00)>>8, f_out);
				putc((k&0x000000ff), f_out);
			}
			word = tp->word;
			while(*word != '\0') { /* copy the word to output */
				putc(*word++, f_out);
			}
			/* Look for stop lists */
			/* Same two threshold tests as in insert_index(): a word that
			 * crossed the threshold is written with ALL_INDEX_MARK and
			 * no index list. */
			if (OneFilePerBlock && !ByteLevelIndex && (file_num > MaxNum8bPartition) && (tp->totalcount > (file_num * MAX_INDEX_PERCENT / 100))) {
				putc(ALL_INDEX_MARK, f_out);
				putc(DONT_CONFUSE_SORT, f_out);
				goto next_token;
			}
			else if (ByteLevelIndex && (tp->totalcount > ( (((total_size>>20) > 0) && ((total_size>>20)*MAX_PER_MB < MAX_ALL_INDEX)) ? ((total_size>>20) * MAX_PER_MB) : DEF_ALL_INDEX) )) {
				putc(ALL_INDEX_MARK, f_out);
				putc(DONT_CONFUSE_SORT, f_out);
				goto next_token;
			}
			putc(WORD_END_MARK, f_out);
			numonline = 0;
			x = -1;		/* last file index written; -1 = none yet */
			y = 0;		/* last offset written for file x (or pending 12-bit nibble byte) */
			even_words = 1;
			ip = tp->ip; /* traverse the indices list */
			ip_old = ip;
			numelements = 0;
			while(ip != NULL) {
				numelements ++;
				if (CountWords) {
					fprintf(f_out, "%d", ip->count);
				}
				else {
					if (ByteLevelIndex) {
						/* Each entry is (file index, byte offset).  The file
						 * index is written when it changes; offsets are
						 * written as differences from the previous offset
						 * in the same file. */
						for (j=0; j<ip->count; j++) {
							if ((ip->offset[j] <= y) && (y > 0) && (x == ip->index[j])) { /* consecutive offsets not increasing in same file! */
								fprintf(stderr, "ignoring (%d, %d) > (%d, %d)\n", x, y, ip->index[j], ip->offset[j]);
								continue; /* error! */
							}
							if (numonline >= MAX_PER_LINE) {
								/* terminate current line since it is too late to put ALL_INDEX_MARK now ... Unfortunate since sort is screwedup */
								putc('\n', f_out);
#if 0
								putc('\n', stdout);
#endif /*0*/
								/* start a continuation record: repeat attribute and word */
								if (StructuredIndex) { /* force big-endian as usual */
									k = encode32b(tp->attribute);
									putc((k&0xff000000)>>24, f_out);
									putc((k&0x00ff0000)>>16, f_out);
									putc((k&0x0000ff00)>>8, f_out);
									putc((k&0x000000ff), f_out);
								}
								word = tp->word;
								while(*word != '\0') { /* copy the word to output */
									putc(*word++, f_out);
								}
								putc(WORD_END_MARK, f_out);
								numonline = 0;
								x = -1; /* to force code below to output it as if it is a fresh file */
								y = 0; /* must output first offset as is, rather than difference */
							}
							if (x != ip->index[j]) {
								if (x != -1) {
									temp = encode8b(0);
									putc(temp, f_out); /* can never ordinarily happen since ICurrentFileOffset is always ++d => delimiter */
								}
								/* file index: one byte, or two bytes high-byte-first */
								if (file_num <= MaxNum8bPartition) {
									x = encode8b(ip->index[j]);
									putc(x&0x000000ff, f_out);
								}
								else {
									x = encode16b(ip->index[j]);
									putc((x&0x0000ff00)>>8, f_out);
									putc(x&0x000000ff, f_out);
								}
								x = ip->index[j]; /* for next round */
#if 0
								printf("#######x=%d ", x);
#endif /*0*/
								y = 0;
							}
							diff = ip->offset[j] - y;
							y = ip->offset[j];
							/* The difference is written in 1 to 4 bytes; in the
							 * multi-byte forms 0x40 / 0x80 / 0xc0 is OR-ed into
							 * the leading byte for the 2 / 3 / 4 byte form. */
							if (diff < MaxNum1BPartition) {
								temp = encode8b(diff);
								putc(temp, f_out);
							}
							else if (diff < MaxNum2BPartition) {
								temp = encode8b((diff/MaxNum8bPartition) | 0x40);
								putc(temp, f_out);
								temp = encode8b(diff % MaxNum8bPartition);
								putc(temp, f_out);
							}
							else if (diff < MaxNum3BPartition) {
								temp = encode8b((diff/MaxNum16bPartition) | 0x80);
								putc(temp, f_out);
								temp = encode16b(diff % MaxNum16bPartition);
								putc((temp & 0x0000ff00) >> 8, f_out);
								putc(temp & 0x000000ff, f_out);
							}
							else {
								temp = encode8b((diff/MaxNum24bPartition) | 0xc0);
								putc(temp, f_out);
								temp = encode24b(diff % MaxNum24bPartition);
								putc((temp & 0x00ff0000) >> 16, f_out);
								putc((temp & 0x0000ff00) >> 8, f_out);
								putc(temp & 0x000000ff, f_out);
							}
							numonline ++;
						}
					} /* ByteLevelIndex */
					else if (OneFilePerBlock) {
						/* One index per file; its width depends on file_num. */
						if (file_num <= MaxNum8bPartition) {
							for(j=0; j<ip->count; j++) {
								putc(encode8b(ip->index[j]), f_out);
							}
						}
						else if (file_num <= MaxNum12bPartition) {
							/* Two 12-bit values share three bytes: low byte of
							 * the first, one byte holding both high nibbles
							 * (kept in y until the second value arrives), low
							 * byte of the second. */
							for(j=0; j<ip->count; j++) {
								x = encode12b(ip->index[j]);
								if (even_words) {
									putc(x & 0x000000ff, f_out); /* lsb */
									y = (x & 0x00000f00)>>8; /* msb */
									even_words = 0;
								}
								else { /* odd number of words so far */
									y |= (x&0x00000f00)>>4; /* msb of x into msb of y */
									putc(y, f_out);
									putc(x&0x000000ff, f_out);
									even_words = 1;
								}
							}
						}
						else if (file_num <= MaxNum16bPartition) {
							for(j=0; j<ip->count; j++) {
								x = encode16b(ip->index[j]);
								putc((x&0x0000ff00)>>8, f_out);
								putc(x&0x000000ff, f_out);
							}
						}
					} /* OneFilePerBlock */
					else { /* normal partitions */
						for(j=0; j<ip->count; j++) {
							putc(ip->index[j], f_out);
						}
					}
				}
				ip = ip->next_i; /* go to next indices */
				my_free(ip_old, sizeof(struct indices));
				ip_old = ip;
			}
			/* flush the pending nibble byte when an odd number of 12-bit values was written */
			if (!ByteLevelIndex && OneFilePerBlock && !even_words && (file_num <= MaxNum12bPartition)) putc(y, f_out);

		next_token:
			if (putc('\n', f_out) == EOF) {
				fprintf(stderr, "Error: write failed at %s:%d\n", __FILE__, __LINE__);
				exit(2);
			}
			tp = tp->next_t; /* go to next token */
#if 0
			fprintf(stderr, "numelements=%d\n", numelements);
#endif /*0*/
			/* The word's bytes are only deducted from memory_usage under
			 * BG_DEBUG; my_free() itself is told size 0 here. */
#if BG_DEBUG
			memory_usage -= (strlen(tp_old->word) + 1);
#endif /*BG_DEBUG*/
			my_free(tp_old->word, 0);
			my_free(tp_old, sizeof(struct token));
			tp_old
/* End of traverse(): finish the per-token loop, then close the output file. */
			= tp;
			numseencount ++;
		}
	}
#if BG_DEBUG
	fprintf(stderr, "out of traverse(): saved/freed %d tokens: new usage: %d\n", numseencount, memory_usage);
#endif
	fflush(f_out);
	fclose(f_out);
}

/*
 * traverse1(): flush the in-memory hash table into the on-disk index.
 *
 * Steps, as performed below:
 *   1. traverse() dumps the hash table to INDEX_DIR/I2 and frees it.
 *   2. I2 is sorted with the external `sort` command into O2, which is then
 *      moved back over I2.  The process exits with status 1 if sort fails.
 *   3. If FirstTraverse1 is set, INDEX_FILE is created from three '%' option
 *      lines followed by the sorted I2, I2 is removed, and FirstTraverse1 is
 *      cleared.
 *   4. Otherwise the existing INDEX_FILE and I2 are combined by merge_in()
 *      into I3 (after copying the three option lines), and I3 is moved over
 *      INDEX_FILE.  The process exits with status 2 if any of the three
 *      files cannot be opened.
 *
 * NOTE(review): every shell command is built with sprintf() into the
 * 256-byte buffer s, although INDEX_DIR alone is MAX_LINE_LEN bytes and most
 * commands contain it twice -- confirm the lengths cannot overflow.
 * NOTE(review): `extern int errno;` may clash with platforms where errno is
 * a macro from <errno.h> -- confirm against the target libc.
 * NOTE(review): the return values of system() (except for sort) and of
 * fgets() are not checked.
 */
traverse1()
{
	FILE *i1, *i2, *i3;
	int ret;
	char s[256];
	char s1[MAX_LINE_LEN];
	extern int errno;
	static int maxsortlinelen = 0;	/* computed once, on the first call */

	/* round() is presumably a glimpse.h macro rather than the libm
	 * function -- TODO confirm. */
	if (maxsortlinelen <= 0) {
		if (file_num < MaxNum8bPartition) maxsortlinelen = round((MaxNum8bPartition * sizeof(int) + MAX_NAME_SIZE), MAX_LINE_LEN) * MAX_LINE_LEN;
		else if (file_num < MaxNum12bPartition) maxsortlinelen = round((MaxNum12bPartition * sizeof(int) + MAX_NAME_SIZE), MAX_LINE_LEN) * MAX_LINE_LEN;
		else maxsortlinelen = MAX_SORTLINE_LEN;
	}

	traverse(); /* will produce .i2 and my_free allocated memory */
#if USESORTZOPTION
	sprintf(s, "sort -z %d %s/%s > %s/%s\n", maxsortlinelen, INDEX_DIR, I2, INDEX_DIR, O2);
#else /*USESORTZOPTION*/
	sprintf(s, "sort %s/%s > %s/%s\n", INDEX_DIR, I2, INDEX_DIR, O2);
#endif /*USESORTZOPTION*/
#ifdef SW_DEBUG
	printf("%s", s);
#endif
	if((ret=system(s)) != 0) {
		sprintf(s1, "system('%s') failed at:\n\t File=%s, Line=%d, Errno=%d", s, __FILE__, __LINE__, errno);
		perror(s1);
		fprintf(stderr, "Please run the program again (if there's no memory try increasing the swap area)\n");
		exit(1);
	}
#ifdef SW_DEBUG
	printf("mv .o2 .i2\n"); fflush(stdout);
#endif
	sprintf(s, "mv %s/%s %s/%s\n", INDEX_DIR, O2, INDEX_DIR, I2);
	system(s);
#if 0
	printf("traversed\n");
	sprintf(s, "head -10 %s/%s\n", INDEX_DIR, I2);
	system(s);
#endif /*0*/

	/*
	 * This flag is set from outside iff build-fast | build-addto option is set.
	 */
	if(FirstTraverse1) {
		/* Mention whether numbers are indexed */
		if(IndexNumber) sprintf(s, "echo %%1234567890 > %s/%s\n", INDEX_DIR, INDEX_FILE);
		else sprintf(s, "echo %% > %s/%s\n", INDEX_DIR, INDEX_FILE);
		system(s);
		/* Put the magic number: 0 if not 1file/blk, numfiles otherwise */
		/* (written negated when ByteLevelIndex is on) */
		if (OneFilePerBlock) {
			if (ByteLevelIndex) sprintf(s, "echo %%-%d >> %s/%s\n", file_num, INDEX_DIR, INDEX_FILE);
			else sprintf(s, "echo %%%d >> %s/%s\n", file_num, INDEX_DIR, INDEX_FILE);
		}
		else sprintf(s, "echo %%0 >> %s/%s\n", INDEX_DIR, INDEX_FILE);
		system(s);
		/* Put the magic number: 0 if not structured index, 1 if so */
		/* (the value actually written for a structured index is attr_num) */
		if (StructuredIndex) sprintf(s, "echo %%%d >> %s/%s\n", attr_num, INDEX_DIR, INDEX_FILE);
		else sprintf(s, "echo %%0 >> %s/%s\n", INDEX_DIR, INDEX_FILE);
		system(s);
#ifdef SW_DEBUG
		sprintf(s, "ls -l %s/.glimpse*\n", INDEX_DIR);
		system(s);
#endif
		sprintf(s, "cat %s/%s >> %s/%s\n", INDEX_DIR, I2, INDEX_DIR, INDEX_FILE);
		system(s);
		sprintf(s, "rm %s/%s\n", INDEX_DIR, I2);
		system(s);
#ifdef SW_DEBUG
		sprintf(s, "ls -l %s/.glimpse*\n", INDEX_DIR);
		system(s);
#endif
#if 0
		printf("catted\n");
		sprintf(s, "head -10 %s/%s\n", INDEX_DIR, INDEX_FILE);
		system(s);
#endif /*0*/
		FirstTraverse1 = 0;
		return;
	}

	/* else not first-traverse */
	/* NOTE(review): i1 is the existing INDEX_FILE and i2 the freshly sorted
	 * I2, so the "new stuff" / "old stuff" labels below look swapped --
	 * confirm. */
	sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE);
	if((i1 = fopen(s, "r")) == 0) { /* new stuff */
		fprintf(stderr, "can't open %s for reading\n", s);
		exit(2);
	}
	sprintf(s, "%s/%s", INDEX_DIR, I2);
	if((i2 = fopen(s, "r")) == 0) { /* old stuff */
		fprintf(stderr, "can't open %s for reading\n", s);
		exit(2);
	}
	sprintf(s, "%s/%s", INDEX_DIR, I3);
	if((i3 = fopen(s, "w")) == 0) { /* result */
		fprintf(stderr, "can't open %s for writing\n", s);
		exit(2);
	}
	/* Copy the 3 option fields (indexnumber, onefileperblock, structuredqueries) */
	fgets(s, 256, i1);
	s[255] = '\0';
	fputs(s, i3);
	fgets(s, 256, i1);
	s[255] = '\0';
	fputs(s, i3);
	fgets(s, 256, i1);
	s[255] = '\0';
	fputs(s, i3);
	merge_in(i2, i1, i3); /* merge_in(i1, i2, i3); */
#ifdef BG_DEBUG
	fprintf(stderr, "out of merge_in()\n");
#endif /*BG_DEBUG*/
	fclose(i1);
	fclose(i2);
	fclose(i3);
#ifdef SW_DEBUG
	printf("mv .i3 %s\n", INDEX_FILE); fflush(stdout);
#endif
	sprintf(s, "mv %s/%s %s/%s", INDEX_DIR, I3, INDEX_DIR, INDEX_FILE);
	system(s);
#ifdef SW_DEBUG
	printf("ls -l .i2 %s\n", INDEX_FILE); fflush(stdout);
	sprintf(s, "ls -l %s/.glimpse*", INDEX_DIR);
	printf("%d\n", system(s));
#endif
#if 0
	printf("merged\n");
	sprintf(s, "head -10 %s/%s\n", INDEX_DIR, INDEX_FILE);
	system(s);
#endif /*0*/
}

/* --------------------------------------------------------------------
build_hash():
input: a set of filenames in name_list[], a partition table p_table[]
output: a hash table hash_table[].

Each file is read in chunks of at most BLOCK_SIZE bytes; every word returned
by getword() is passed to insert_h() together with the file number
(OneFilePerBlock) or the partition number (otherwise).  Whenever
icount - hash_icount reaches I_THRESHOLD the table is flushed to disk with
traverse1() and re-initialised.  Files whose bit is set in disable_list[]
are skipped.  Before the file itself is opened, tuncompress_file() and then
apply_filter() are tried; when one of them produces outname, that temporary
file is indexed instead and unlinked afterwards.
-----------------------------------------------------------------------*/
build_hash()
{
	FILE *fd; /* opened file number */
	int i, pn; /* pn: current partition */
	int num_read;
	char word[256];		/* hides the file-scope word[] buffer */
	struct stat stbuf;
	int offset;
	int toread;
	unsigned char *buffer;
	unsigned char *bx;
	unsigned char *buffer_end;
	int tried_once = 0;
	int attribute;
	int ret;
	char outname[MAX_LINE_LEN];
	char *unlinkname = NULL;	/* temporary file to unlink once indexed, if any */
	int pid = getpid();

	if (StructuredIndex) region_initialize();
	init_hash_table();
#ifdef debug
	printf("entering build_hash(), part_num=%d\n", part_num);
#endif

	/* Allocate the read buffer; on failure flush the (empty) table once
	 * and retry, then give up. */
	tried_once = 0;
try_again_1:
	buffer = (unsigned char *) my_malloc(sizeof(char)* BLOCK_SIZE + 10); /* always read in units of BLOCK_SIZE or less */
	if(buffer == NULL) {
		fprintf(stderr, "not enough memory in build_hash\n");
		if (tried_once) return;
		traverse1();
		init_hash_table();
		tried_once = 1;
		goto try_again_1;
	}
	bx = buffer;	/* bx remembers the start of the allocation */

	if (OneFilePerBlock) {
		for(i=0; i<file_num; i++) {
			unlinkname = NULL;
			if (disable_list[block2index(i)] & mask_int[i%(8*sizeof(int))]) continue;
			if ((ret = tuncompress_file(name_list[i], outname, TC_EASYSEARCH | TC_OVERWRITE | TC_NOPROMPT)) > 0) { /* do not remove old .TZ file */
				if (((fd = fopen(outname, "r")) == NULL) ) {
					remove_filename(i, -1);
					unlink(outname);
					continue;
				}
				/* not handling structured indices for compressed files now since offset computations will be incorrect */
				unlinkname = outname;
				goto index_file1;
			}

			/* Try to apply the filter */
			sprintf(outname, "%s/.glimpse_apply.%d", INDEX_DIR, pid);
			if ((ret = apply_filter(name_list[i], outname)) == 1) {
				/* Some pattern matched AND some filter was successful */
				if (((fd = fopen(outname, "r")) == NULL) ) {
					/* error: shouldn't have returned 1! */
					remove_filename(i, -1);
					unlink(outname);
					continue;
				}
				/* not handling structured indices for filtered files now since offset computations will be incorrect */
				unlinkname = outname;
				goto index_file1;
			}
			else if (ret == 2) {
				/* Some pattern matched but no filter was successful */
				if (filetype(name_list[i], 0)) { /* try to index input file if it satisfies filetype */
					remove_filename(i, -1);
					unlink(outname);
					continue;
				}
				unlinkname = outname;
			}

			if (StructuredIndex && (-1 == region_create(name_list[i]))) {
				fprintf(stderr, "permission denied or non-existent file: %s\n", name_list[i]);
				remove_filename(i, -1);
				continue;
			}
			if (((fd = fopen(name_list[i], "r")) == NULL) ) {
				fprintf(stderr, "permission denied or non-existent file: %s\n", name_list[i]);
				remove_filename(i, -1);
				if (StructuredIndex) region_destroy(); /* cannot happen! */
				continue;
			}

		index_file1:
#ifdef SW_DEBUG
			if (AddToIndex || FastIndex) printf("adding words of %s in %d\n", name_list[i], i);
			printf("%s\n", name_list[i]);
#endif
			/* stat(name_list[i], &stbuf); Chris Dalton */
			fstat(fileno(fd), &stbuf);
#ifdef SW_DEBUG
			printf("filesize: %d\n", stbuf.st_size);
#endif
#ifdef UDI_DEBUG
			printf("%s ", name_list[i]);
			printf("size: %d ", stbuf.st_size);
#endif
			/* buffer always points to a BLOCK_SIZE block of allocated memory */
			for (offset = 0; offset < stbuf.st_size; offset += BLOCK_SIZE) {
				NextICurrentFileOffset = ICurrentFileOffset = offset;
				/* the last chunk of a file may be shorter than BLOCK_SIZE */
				toread = offset + BLOCK_SIZE >= stbuf.st_size ?
stbuf.st_size - offset : BLOCK_SIZE; fseek(fd, offset, 0); bx= buffer; num_read = 0; while ((toread > 0) && ((num_read = fread(bx, 1, toread, fd)) < toread)) { if (num_read <= 0) { buffer = bx; fprintf(stderr, "read error on file %s at offset %d\n", name_list[i], offset); goto break_break1; /* C doesn't have break; break; */ } bx += num_read; toread -= num_read; } if (num_read >= toread) { bx += num_read; toread -= num_read; } buffer_end = bx; bx = buffer; /* buffer_end = buffer + toread; */ while ((buffer=(unsigned char *) getword(word, buffer, buffer_end, &attribute)) < buffer_end) { /* printf("%s\n", word); */ if(word[0] == '\0') continue; if(icount - hash_icount >= I_THRESHOLD) { #if BG_DEBUG fprintf(LOGFILE, "reached I_THRESHOLD at %d\n", icount - hash_icount); #endif /*BG_DEBUG*/ traverse1(); init_hash_table(); hash_icount = icount; } insert_h(word, i, attribute); } if (word[0] != '\0') { /* printf("%s\n", word); */ if(icount - hash_icount >= I_THRESHOLD) { #if BG_DEBUG fprintf(LOGFILE, "reached I_THRESHOLD at %d\n", icount - hash_icount); #endif /*BG_DEBUG*/ traverse1(); init_hash_table(); hash_icount = icount; } insert_h(word, i, attribute); } buffer = bx; } break_break1: fclose(fd); if (unlinkname != NULL) unlink(unlinkname); #ifdef UDI_DEBUG printf("add to index: %d\n",icount-save_icount); #endif if ((MAXWORDSPERFILE > 0) && (icount-save_icount > MAXWORDSPERFILE)) { fprintf(MESSAGEFILE, "%d words are contributed by %s\n", icount-save_icount, name_list[i]); AddedMaxWordsMessage = ON; } if (IndexNumber && NUMERICWORDPERCENT && (numeric_icount * 100 > (icount - save_icount) * NUMERICWORDPERCENT) && (icount - save_icount > MIN_WORDS)) { fprintf(MESSAGEFILE, "NUMBERS occur in %d%% of %d words contributed by %s\n", (numeric_icount * 100)/(icount - save_icount), icount - save_icount, name_list[i]); AddedMixedWordsMessage = ON; } numeric_icount=0; save_icount=icount; if (StructuredIndex) region_destroy(); } my_free(bx, BLOCK_SIZE); return; } for(pn=1; pn < 
part_num; pn++) /* partition # 0 is not accessed */ { if (pn == '\n') continue; /* There cannot be a partition # '\n' or 0: see partition.c */ for(i=p_table[pn]; i<p_table[pn+1]; i++) { unlinkname = NULL; if (disable_list[block2index(i)] & mask_int[i%(8*sizeof(int))]) continue; if (BuildDictionaryExisting) { if (((fd = fopen(name_list[i], "r")) == NULL) ) { fprintf(stderr, "permission denied or non-existent file: %s\n", name_list[i]); remove_filename(i, -1); continue; } if (!CompressAfterBuild) unlinkname = name_list[i]; /* not needed anymore */ goto index_file2; } if ((ret = tuncompress_file(name_list[i], outname, TC_EASYSEARCH | TC_OVERWRITE | TC_NOPROMPT)) > 0) { /* do not remove old .TZ file */ if (((fd = fopen(outname, "r")) == NULL) ) { remove_filename(i, -1); unlink(outname); continue; } /* not handling structured indices for compressed files now since offset computations will be incorrect */ if (BuildDictionary && CompressAfterBuild) strcpy(name_list[i], outname); /* name of clear file will be smaller, so enough space */ else unlinkname = outname; goto index_file2; } /* Try to apply the filter */ sprintf(outname, "%s/.glimpse_apply.%d", INDEX_DIR, pid); if ((ret = apply_filter(name_list[i], outname)) == 1) { /* Some pattern matched AND some filter was successful */ if (((fd = fopen(outname, "r")) == NULL) ) { /* error: shouldn't have returned 1! 
*/ remove_filename(i, -1); unlink(outname); continue; } /* not handling structured indices for filtered files now since offset computations will be incorrect */ unlinkname = outname; goto index_file2; } else if (ret == 2) { /* Some pattern matched but no filter was successful */ if (filetype(name_list[i], 0)) { /* try to index input file if it satisfies filetype */ remove_filename(i, -1); unlink(outname); continue; } unlinkname = outname; } if (StructuredIndex && (-1 == region_create(name_list[i]))) { fprintf(stderr, "permission denied or non-existent file: %s\n", name_list[i]); remove_filename(i, -1); continue; } if (((fd = fopen(name_list[i], "r")) == NULL) ) { fprintf(stderr, "permission denied or non-existent file: %s\n", name_list[i]); remove_filename(i, -1); if (StructuredIndex) region_destroy(); /* cannot happen! */ continue; } index_file2: #ifdef SW_DEBUG if (AddToIndex || FastIndex) printf("adding words of %s in %d\n", name_list[i], pn); printf("%s\n", name_list[i]); #endif /* stat(name_list[i], &stbuf); Chris Dalton */ fstat(fileno(fd), &stbuf); #ifdef SW_DEBUG printf("filesize: %d\n", stbuf.st_size); #endif #ifdef UDI_DEBUG printf("%s ", name_list[i]); printf("size: %d ", stbuf.st_size); #endif /* buffer always points to a BLOCK_SIZE block of allocated memory */ for (offset = 0; offset < stbuf.st_size; offset += BLOCK_SIZE) { NextICurrentFileOffset = ICurrentFileOffset = offset; toread = offset + BLOCK_SIZE >= stbuf.st_size ? 
stbuf.st_size - offset : BLOCK_SIZE; fseek(fd, offset, 0); bx= buffer; num_read = 0; while ((toread > 0) && ((num_read = fread(bx, 1, toread, fd)) < toread)) { if (num_read <= 0) { buffer = bx; fprintf(stderr, "read error on file %s at offset %d\n", name_list[i], offset); goto break_break2; /* C doesn't have break; break; */ } bx += num_read; toread -= num_read; } if (num_read >= toread) { bx += num_read; toread -= num_read; } buffer_end = bx; bx = buffer; /* buffer_end = buffer + toread; */ while ((buffer=(unsigned char *) getword(word, buffer, buffer_end, &attribute)) < buffer_end) { /* printf("%s\n", word); */ if(word[0] == '\0') continue; if(icount - hash_icount >= I_THRESHOLD) { #if BG_DEBUG fprintf(LOGFILE, "reached I_THRESHOLD at %d\n", icount - hash_icount); #endif /*BG_DEBUG*/ traverse1(); init_hash_table(); hash_icount = icount; } insert_h(word, pn, attribute); } if (word[0] != '\0') { /* printf("%s\n", word); */ if(icount - hash_icount >= I_THRESHOLD) { #if BG_DEBUG fprintf(LOGFILE, "reached I_THRESHOLD at %d\n", icount - hash_icount); #endif /*BG_DEBUG*/ traverse1(); init_hash_table(); hash_icount = icount; } insert_h(word, pn, attribute); } buffer = bx; } break_break2: fclose(fd); if (unlinkname != NULL) unlink(unlinkname); #ifdef UDI_DEBUG printf("add to index: %d\n",icount-save_icount); #endif if ((MAXWORDSPERFILE > 0) && (icount-save_icount > MAXWORDSPERFILE)) { fprintf(MESSAGEFILE, "%d words are contributed by %s\n", icount-save_icount, name_list[i]); AddedMaxWordsMessage = ON; } if (IndexNumber && NUMERICWORDPERCENT && (numeric_icount * 100 > (icount - save_icount) * NUMERICWORDPERCENT) && (icount - save_icount > MIN_WORDS)) { fprintf(MESSAGEFILE, "NUMBERS occur in %d%% of %d words contributed by %s\n", (numeric_icount * 100)/(icount - save_icount), icount - save_icount, name_list[i]); AddedMixedWordsMessage = ON; } numeric_icount=0; save_icount=icount; if (StructuredIndex) region_destroy(); } } my_free(bx, BLOCK_SIZE); } init_hash_table() { int 
i; for(i=0; i<MAX_64K_HASH; i++) hash_table[i] = (struct token *)NULL; hash_table[65535] = (struct token *)NULL; return; } /* ------------------------------------------------------------------------ input: a word (a string), a hash table (each entry points to a list of tokens. (a token is a structure containing 'word' and a pointer to a list of indices)). function: insert the word to appropriate position in the table. if the inserted word is already in the data structure, then update the list of indices corresponding to that 'word'. otherwise create a new token. ---------------------------------------------------------------------------*/ insert_h(word, pn, attribute) char *word; int pn; int attribute; { int hash_value=0; struct token *tp; struct token *tp_bak; struct indices *iip; int wordlen = strlen(word); int tried_once; /* all words with same attribute at same place in hash table */ hash_value = hash64k(word, wordlen); tp_bak = tp = hash_table[hash_value]; while(tp != NULL) { if((strcmp(word, tp->word) == 0) && (tp->attribute == attribute)) { insert_index(tp, pn); return; } tp_bak = tp; tp = tp->next_t; } /* this is a new word, insert it */ tried_once = 0; try_again_2: if((tp = (struct token *) my_malloc(sizeof(struct token))) == NULL) { tp_bak = NULL; fprintf(stderr, "not enough memory in insert_h1 at icount=%d. skipping...\n", icount); if (tried_once) return; traverse1(); init_hash_table(); tried_once = 1; goto try_again_2; } tried_once = 0; try_again_3: if((tp->word = (char *) my_malloc(sizeof(char) * (wordlen+1))) == NULL) { tp_bak = NULL; fprintf(stderr, "not enough memory in insert_h2 at icount=%d. 
skipping...\n", icount); if (tried_once) { my_free(tp, sizeof(struct token)); return; } traverse1(); init_hash_table(); tried_once = 1; goto try_again_3; } strcpy(tp->word, word); tp->attribute = attribute; /* the index list has a first index */ tried_once = 0; try_again_4: if((iip = (struct indices *) my_malloc(sizeof(struct indices))) == NULL) { tp_bak = NULL; fprintf(stderr, "not enough memory in insert_h3 at icount=%d. skipping...\n", icount); if (tried_once) { my_free(tp->word, wordlen + 1); my_free(tp, sizeof(struct token)); return; } traverse1(); init_hash_table(); tried_once = 1; goto try_again_4; } icount++; if (IndexNumber && NUMERICWORDPERCENT) { int i=0; while(word[i] != '\0') { if (!isalpha(word[i])) break; i++; } if (word[i] != '\0') numeric_icount ++; } #ifdef SW_DEBUG if((icount & 01777) == 0) printf("icount = %d\n", icount); #endif iip->count = 1; if (!CountWords) { iip->index[0] = pn; iip->offset[0] = ICurrentFileOffset; } /* assign both head and tail */ iip->next_i = NULL; tp->ip = iip; tp->lastip = iip; if(tp_bak == NULL) hash_table[hash_value] = tp; else tp_bak->next_t = tp; tp->next_t = NULL; tp->totalcount = 1; } /* ------------------------------------------------------------------- insert_index(): insert an index, i.e., pn, into an indices structure. The indices structure is a linked list where the 'first' one is always the active indices structure. When the active one is filled with 8 indices an indicies structure is created and becomes the active one. tp points to the token structure. so, tp->ip is always the active indices structure. ------------------------------------------------------------------- */ insert_index(tp, pn) struct token *tp; /* insert a index into a indices structure */ int pn; { struct indices *iip, *temp; struct indices *ip = (ByteLevelIndex ? 
tp->lastip : tp->ip); int tried_once; if (CountWords) { /* I am not interested in maintaining where a word occurs: only the number of times it occurs */ ip->count ++; return; } /* Check for stop-list */ if (OneFilePerBlock && !ByteLevelIndex && (file_num > MaxNum8bPartition) && (tp->totalcount > (file_num * MAX_INDEX_PERCENT / 100))) return; if (ByteLevelIndex && (tp->totalcount > ( (((total_size>>20) > 0) && ((total_size>>20)*MAX_PER_MB < MAX_ALL_INDEX)) ? ((total_size>>20) * MAX_PER_MB) : DEF_ALL_INDEX) )) return; if (ByteLevelIndex) { if ((ip->index[ip->count - 1] == pn) && (ip->offset[ip->count - 1] == ICurrentFileOffset)) return; /* in identical position */ } else if (ip->index[ip->count - 1] == pn) return; /* current word is not the first appearance in partition pn */ if(ip->count < 8) { ip->offset[ip->count] = ICurrentFileOffset; ip->index[ip->count++] = pn; return; } tried_once = 0; try_again_5: if((iip = (struct indices *) my_malloc(sizeof(struct indices)))==NULL) { fprintf(stderr, "not enough memory in insert_index at icount=%d. skipping...\n", icount); if (tried_once) return; traverse1(); init_hash_table(); tried_once = 1; goto try_again_5; } icount++; if (ByteLevelIndex) { /* insert at the end */ tp->lastip->next_i = iip; iip->next_i = NULL; tp->lastip = iip; } else { iip->next_i = tp->ip; tp->ip = iip; } iip->count = 1; iip->offset[0] = ICurrentFileOffset; iip->index[0] = pn; tp->totalcount ++; if ( (OneFilePerBlock && !ByteLevelIndex && (file_num > MaxNum8bPartition) && (tp->totalcount > (file_num * MAX_INDEX_PERCENT / 100))) || (ByteLevelIndex && (tp->totalcount > ( (((total_size>>20) > 0) && ((total_size>>20)*MAX_PER_MB < MAX_ALL_INDEX)) ? 
((total_size>>20) * MAX_PER_MB) : DEF_ALL_INDEX) )) ) { for (iip=tp->ip; iip != NULL; temp = iip, iip = iip->next_i, my_free(temp, sizeof(struct indices))); tp->ip = NULL; /* never need to insert anything else here */ } /* printf("returning from insert_index()\n"); fflush(stderr); */ return; } /* ----------------------------------------------------------------- input: a word (a string of ascii character terminated by NULL) output: a hash_value of the input word. hash function: if the word has length <= 4 the hash value is just a concatenation of the last four bits of the characters. if the word has length > 4, then after the above operation, the hash value is updated by adding each remaining character. (and AND with the 16-bits mask). ---------------------------------------------------------------- */ hash64k(word, len) char *word; int len; { unsigned int hash_value=0; unsigned int mask_4=017; unsigned int mask_16=0177777; int i; if(len<=4) { for(i=0; i<len; i++) { hash_value = (hash_value << 4) | (word[i]&mask_4); /* hash_value = hash_value & mask_16; */ } } else { for(i=0; i<4; i++) { hash_value = (hash_value << 4) | (word[i]&mask_4); /* hash_value = hash_value & mask_16; */ } for(i=4; i<len; i++) hash_value = mask_16 & (hash_value + word[i]); } return(hash_value & mask_16); } /* Scan the indexed "word" from an index line: see io.c/merge_splits() */ scanword(word, buffer, buffer_end) unsigned char *word, *buffer, *buffer_end; { int i = MAX_WORD_SIZE; if (StructuredIndex) { word[0] = buffer[0]; word[1] = buffer[1]; word[2] = buffer[2]; word[3] = buffer[3]; buffer += 4; /* skip over 4B attribute field */ word += 4; } while ((i-- != 0) && (buffer <= buffer_end) && (*buffer != ALL_INDEX_MARK) && (*buffer != WORD_END_MARK) && (*buffer != '\n') && (*buffer != '\0')) *word ++ = *buffer ++; *word = '\0'; } /* Globals used in merge, and also in glimpse's main.c */ unsigned int src_index_set[REAL_PARTITION]; unsigned int dest_index_set[REAL_PARTITION]; unsigned char 
src_index_buf[REAL_INDEX_BUF]; unsigned char dest_index_buf[REAL_INDEX_BUF]; unsigned char merge_index_buf[REAL_INDEX_BUF]; /* merge index file f1 and f2, then put the result in index file f3 */ merge_in(f1, f2, f3) FILE *f1, *f2, *f3; { int src_mark, dest_mark; int src_num, dest_num; int src_end_pt, dest_end_pt; int cmp=0; /* the result of strcmp */ int bdx, bdx1, bdx2, merge_len, i, j; int TAIL1=0; char word1[MAX_WORD_SIZE+6]; /* used only for strcmp() */ char word2[MAX_WORD_SIZE+6]; /* used only for strcmp() */ int x=0, y=0, even_words = 1; /* LOOK OUT FOR: [memset, fgets, endpt-forloop, scanword] 4-tuples: invariant */ #if debug printf("in merge_in()\n"); fflush(stdout); #endif memset(dest_index_buf, '\0', REAL_INDEX_BUF); fgets(dest_index_buf, REAL_INDEX_BUF, f2); dest_index_buf[REAL_INDEX_BUF - 1] = '\0'; dest_end_pt = strlen(dest_index_buf); scanword(word2, dest_index_buf, dest_index_buf+dest_end_pt); #ifdef debug printf("word2 = %s\n", word2); #endif memset(src_index_buf, '\0', REAL_INDEX_BUF); while(fgets(src_index_buf, REAL_INDEX_BUF, f1)) { src_index_buf[REAL_INDEX_BUF - 1] = '\0'; src_end_pt = strlen(src_index_buf); scanword(word1, src_index_buf, src_index_buf+src_end_pt); #ifdef debug printf("word1 = %s\n", word1); #endif while((cmp = strncmp(word1, word2, MAX_WORD_SIZE+4)) > 0) { fputs(dest_index_buf, f3); memset(dest_index_buf, '\0', dest_end_pt+2); if(fgets(dest_index_buf, REAL_INDEX_BUF, f2) == NULL) { dest_index_buf[REAL_INDEX_BUF - 1] = '\0'; TAIL1 = ON; break; } dest_index_buf[REAL_INDEX_BUF - 1] = '\0'; dest_end_pt = strlen(dest_index_buf); scanword(word2, dest_index_buf, dest_index_buf+dest_end_pt); } if(TAIL1 == ON) break; if(cmp == 0) { /* we need to join the index of word1 and word2 */ #ifdef debug printf("joining src_index_buf and dest_index_buf\n"); printf("src_index_buf = %s", src_index_buf); printf("dest_index_buf = %s", dest_index_buf); #endif if (!CountWords && !ByteLevelIndex) { /* have to look for common indices and exclude them */ 
int oldbdx1, oldbdx2; merge_index_buf[0] = '\0'; merge_len = 0; if (StructuredIndex) oldbdx1 = bdx1 = 4; else oldbdx1 = bdx1 = 0; /* src_index_buf[src_end_pt] is '\0', src_index_buf[src_end_pt-1] is '\n' */ while((bdx1 < src_end_pt) && (src_index_buf[bdx1] != WORD_END_MARK) && (src_index_buf[bdx1] != ALL_INDEX_MARK)) bdx1 ++; if ((bdx1 > oldbdx1) && (bdx1 < src_end_pt)) { /* src_index_buf[bdx1] is the word-end-mark */ src_mark = src_index_buf[bdx1]; src_index_buf[bdx1] = '\0'; /* terminate word */ strcpy(merge_index_buf, src_index_buf); /* save the word itself */ merge_len = strlen(src_index_buf); /* merge_index_buf[merge_len] is '\0', merge_index_buf[merge_len-1] is a part of the word */ bdx1 ++; /* skip word end marker: src_index_buf[bdx1] is a partition# */ } even_words = 1; src_num = 0; if (OneFilePerBlock) memset((char *)src_index_set, '\0', round(file_num, 8)+2); else memset((char *)src_index_set, '\0', sizeof(int) * (MAX_PARTITION + 1)); while(bdx1 < src_end_pt - 1) { if (OneFilePerBlock) { x = 0; if (file_num <= MaxNum8bPartition) { x = decode8b(src_index_buf[bdx1]); bdx1 ++; } else if (file_num <= MaxNum12bPartition) { if (even_words) { x = ((src_index_buf[bdx1+1] & 0x0000000f) << 8) | src_index_buf[bdx1]; x = decode12b(x); bdx1 += 2; even_words = 0; } else { /* odd number of words so far */ x = ((src_index_buf[bdx1-1] & 0x000000f0) << 4) | src_index_buf[bdx1]; x = decode12b(x); bdx1 ++; even_words = 1; } } else if (file_num <= MaxNum16bPartition) { x = (src_index_buf[bdx1] << 8) | src_index_buf[bdx1+1]; x = decode16b(x); bdx1 += 2; } src_index_set[block2index(x)] |= mask_int[x % (8*sizeof(int))]; src_num ++; } else src_index_set[src_num++] = src_index_buf[bdx1++]; } if (StructuredIndex) oldbdx2 = bdx2 = 4; else oldbdx2 = bdx2 = 0; /* dest_index_buf[dest_end_pt] is '\0', dest_index_buf[dest_end_pt-1] is '\n' */ while((bdx2 < dest_end_pt) && (dest_index_buf[bdx2] != WORD_END_MARK) && (dest_index_buf[bdx2] != ALL_INDEX_MARK)) bdx2 ++; if ((bdx2 > oldbdx2) && 
(bdx2 < dest_end_pt)) { /* dest_index_buf[bdx2] is the word-end-mark */ dest_mark = dest_index_buf[bdx2]; dest_index_buf[bdx2] = '\0'; /* terminate word */ if (merge_len == 0) { strcpy(merge_index_buf, dest_index_buf); /* save the word itself */ merge_len = strlen(merge_index_buf); /* merge_index_buf[merge_len] is '\0', merge_index_buf[merge_len-1] is a part of the word */ } bdx2 ++; /* skip word end marker: dest_index_buf[bdx2] is a partition# */ } even_words = 1; dest_num = 0; if (OneFilePerBlock) memset((char *)dest_index_set, '\0', round(file_num, 8)+2); else memset((char *)dest_index_set, '\0', sizeof(int) * (MAX_PARTITION + 1)); while(bdx2 < dest_end_pt - 1) { if (OneFilePerBlock) { x = 0; if (file_num <= MaxNum8bPartition) { x = decode8b(dest_index_buf[bdx2]); bdx2 ++; } else if (file_num <= MaxNum12bPartition) { if (even_words) { x = ((dest_index_buf[bdx2+1] & 0x0000000f) << 8) | dest_index_buf[bdx2]; x = decode12b(x); bdx2 += 2; even_words = 0; } else { /* odd number of words so far */ x = ((dest_index_buf[bdx2-1] & 0x000000f0) << 4) | dest_index_buf[bdx2]; x = decode12b(x); bdx2 ++; even_words = 1; } } else if (file_num <= MaxNum16bPartition) { x = (dest_index_buf[bdx2] << 8) | dest_index_buf[bdx2+1]; x = decode16b(x); bdx2 += 2; } dest_index_set[block2index(x)] |= mask_int[x % (8*sizeof(int))]; dest_num ++; } else dest_index_set[dest_num++] = dest_index_buf[bdx2++]; } even_words = 1; if (merge_len > 0) { if(OneFilePerBlock && ((src_mark == ALL_INDEX_MARK) || (dest_mark == ALL_INDEX_MARK) || ((file_num > MaxNum8bPartition) && (src_num + dest_num > file_num*MAX_INDEX_PERCENT / 100)) )) { merge_index_buf[merge_len++] = ALL_INDEX_MARK; if (file_num <= MaxNum8bPartition) merge_index_buf[merge_len ++] = encode8b(DONT_CONFUSE_SORT); else if (file_num <= MaxNum12bPartition) { merge_index_buf[merge_len ++] = (encode12b(DONT_CONFUSE_SORT) & 0x00000f00) >> 8; merge_index_buf[merge_len ++] = encode12b(DONT_CONFUSE_SORT) & 0x000000ff; } else { 
merge_index_buf[merge_len ++] = (encode16b(DONT_CONFUSE_SORT) & 0x0000ff00) >> 8; merge_index_buf[merge_len ++] = encode16b(DONT_CONFUSE_SORT) & 0x000000ff; } goto final_merge; } merge_index_buf[merge_len++] = WORD_END_MARK; if (OneFilePerBlock) { for (i=0; i<round(file_num, 8*sizeof(int)); i++) dest_index_set[i] |= src_index_set[i]; /* take union */ for (i=0; i<round(file_num, 8*sizeof(int)); i++) if (dest_index_set[i]) for (j=0; j<8*sizeof(int); j++) if (dest_index_set[i] & mask_int[j]) { x = i*8*sizeof(int) + j; if (file_num <= MaxNum8bPartition) { merge_index_buf[merge_len++] = encode8b(x); } else if (file_num <= MaxNum12bPartition) { x = encode12b(x); if (even_words) { merge_index_buf[merge_len++] = x & 0x000000ff; /* lsb */ y = (x & 0x00000f00)>>8; /* msb */ even_words = 0; } else { /* odd number of words so far */ y |= (x&0x00000f00)>>4; /* msb of x into msb of y */ merge_index_buf[merge_len ++] = y; merge_index_buf[merge_len ++] = x&0x000000ff; even_words = 1; } } else if (file_num <= MaxNum16bPartition) { x = encode16b(x); merge_index_buf[merge_len ++] = (x&0x0000ff00)>>8; merge_index_buf[merge_len ++] = x&0x000000ff; } } if (!even_words && (file_num <= MaxNum12bPartition)) merge_index_buf[merge_len ++] = y; } else { /* normal indexing */ for (i=0; i<src_num; i++) { merge_index_buf[merge_len++] = src_index_set[i]; } for (j=0; j<dest_num; j++) { for (i=0; i<src_num; i++) if (dest_index_set[j] == src_index_set[i]) break; if (i>=src_num) /* did not find match */ merge_index_buf[merge_len++] = dest_index_set[j]; /* Doesn't matter if dest_index_set is int-array (merge_index_buf being char array) since dest_index_set has only a char */ } } final_merge: merge_index_buf[merge_len++] = '\n'; merge_index_buf[merge_len] = '\0'; fputs(merge_index_buf, f3); /* fprintf(stderr, "%d+%d=%d ", src_end_pt, dest_end_pt, merge_len); */ } /* merge_len > 0 */ } else if (CountWords) { /* indices are frequencies, so just merge them: OneFilPerBlock is ignored */ 
strcpy(merge_index_buf, src_index_buf); bdx = strlen(merge_index_buf); /* merge_index_buf[bdx] is '\0', merge_index_buf[bdx-1] is '\n' */ if (bdx > 1) bdx--; /* now merge_index_buf[bdx] is '\n', merge_index_buf[bdx-1] is the last index */ bdx2 = 0; /* find the first index */ if (IndexNumber) while(isalnum(dest_index_buf[bdx2])) bdx2 ++; else while(isalpha(dest_index_buf[bdx2])) bdx2++; /* to skip over the word-end marker of dest_index_buf (which is a blank) */ if (bdx2 > 0) bdx2 ++; if (bdx >= 1) { merge_index_buf[bdx++] = ' '; /* blank separated fscanf-able list of integers representing counts */ } /* append the indices of word1 to the buffer */ if (dest_index_buf[bdx2] > 0) { while(dest_index_buf[bdx2]>0) merge_index_buf[bdx++] = dest_index_buf[bdx2++]; /* '\n' gets copied */ merge_index_buf[bdx] = '\0'; } /* else, no need to copy anything */ fputs(merge_index_buf, f3); } else { /* indices are actual occurrences (ByteLevelIndex), so just cat them one after the other, src first since that is i2, the 1st one */ /* First put out the attribute and the word */ bdx1 = 0; if (StructuredIndex) { putc(src_index_buf[0], f3); putc(src_index_buf[1], f3); putc(src_index_buf[2], f3); putc(src_index_buf[3], f3); bdx1 = 4; } while ((bdx1<src_end_pt) && (src_index_buf[bdx1] != WORD_END_MARK) && (src_index_buf[bdx1] != ALL_INDEX_MARK) && (src_index_buf[bdx1] != '\n') && (src_index_buf[bdx1] != '\0')) putc(src_index_buf[bdx1 ++], f3); /* Now check what end-mark we should put */ if ((bdx1 >= src_end_pt) || (src_index_buf[bdx1] == ALL_INDEX_MARK) || (src_end_pt + dest_end_pt >= MAX_SORTLINE_LEN)) { putc(ALL_INDEX_MARK, f3); putc(DONT_CONFUSE_SORT, f3); putc('\n', f3); } else { /* dest can be all index mark */ if (StructuredIndex) bdx2 = 4; else bdx2 = 0; while ((bdx2<dest_end_pt) && (dest_index_buf[bdx2] != WORD_END_MARK) && (dest_index_buf[bdx2] != ALL_INDEX_MARK) && (dest_index_buf[bdx2] != '\n') && (dest_index_buf[bdx2] != '\0')) bdx2 ++; if ((bdx2 >= dest_end_pt) || 
(dest_index_buf[bdx2] == ALL_INDEX_MARK)) { putc(ALL_INDEX_MARK, f3); putc(DONT_CONFUSE_SORT, f3); putc('\n', f3); } else { /* we have to put out both the lists */ putc(WORD_END_MARK, f3); bdx1 ++; /* skip over WORD_END_MARK */ while ((bdx1 < src_end_pt) && (src_index_buf[bdx1] != '\n') && (src_index_buf[bdx1] != '\0')) putc(src_index_buf[bdx1++], f3); fputc(encode8b(0), f3); /* instead of the '\n' after end of src_index_buf */ bdx2 ++; /* skip over WORD_END_MARK */ while ((bdx2 < dest_end_pt) && (dest_index_buf[bdx2] != '\n') && (dest_index_buf[bdx2] != '\0')) putc(dest_index_buf[bdx2++], f3); putc('\n', f3); } } } #if debug printf("merge_index_buf = %s", merge_index_buf); #endif /*debug*/ memset(dest_index_buf, '\0', dest_end_pt+2); if(fgets(dest_index_buf, REAL_INDEX_BUF, f2) == 0) { dest_index_buf[REAL_INDEX_BUF - 1] = '\0'; TAIL1 = ON; break; } dest_index_buf[REAL_INDEX_BUF - 1] = '\0'; dest_end_pt = strlen(dest_index_buf); scanword(word2, dest_index_buf, dest_index_buf+dest_end_pt); } else { /* word1 < word2, so output src_index_buf */ fputs(src_index_buf, f3); } memset(src_index_buf, '\0', src_end_pt+2); } if(TAIL1) { if(cmp != 0) fputs(src_index_buf, f3); memset(src_index_buf, '\0', src_end_pt+2); while(fgets(src_index_buf, REAL_INDEX_BUF, f1)) { src_index_buf[REAL_INDEX_BUF - 1] = '\0'; src_end_pt = strlen(src_index_buf); fputs(src_index_buf, f3); memset(src_index_buf, '\0', src_end_pt+2); } } else { /* output the tail of f2 */ fputs(dest_index_buf, f3); memset(dest_index_buf, '\0', dest_end_pt+2); while(fgets(dest_index_buf, REAL_INDEX_BUF, f2)) { dest_index_buf[REAL_INDEX_BUF - 1] = '\0'; dest_end_pt = strlen(dest_index_buf); fputs(dest_index_buf, f3); memset(dest_index_buf, '\0', dest_end_pt+2); } } return; } remove_filename(fileindex, new_partition) int fileindex, new_partition; { if ((fileindex < 0) || (fileindex >= MAXNUM_FILE)) return; #if BG_DEBUG fprintf(LOGFILE, "removing %s from index\n", name_list[fileindex]); memory_usage -= 
(strlen(name_list[fileindex]) + 2); #endif /*BG_DEBUG*/ my_free(name_list[fileindex], 0); name_list[fileindex] = NULL; disable_list[block2index(fileindex)] |= mask_int[fileindex % (8*sizeof(int))]; }